SamruddhiMistry-Project9
import math as math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import datasets, linear_model, metrics
import seaborn as sns
plt.style.use('seaborn')
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from sklearn.linear_model import LinearRegression
import seaborn as sns
from statsmodels.stats.anova import anova_lm
from statsmodels.datasets import get_rdataset
sns.set()
# Load the global temperature CSV, parsing the 'dt' column as dates
data = pd.read_csv(
    "C:/Users/Deepali Paul/Desktop/MSBA 320/GlobalTemp.csv",
    parse_dates=['dt'],
)
# Show the column names as a plain list
data.columns.tolist()
['dt', 'LandAverageTemperature', 'LandAverageTemperatureUncertainty', 'LandMaxTemperature', 'LandMaxTemperatureUncertainty', 'LandMinTemperature', 'LandMinTemperatureUncertainty', 'LandAndOceanAverageTemperature', 'LandAndOceanAverageTemperatureUncertainty']
# Preview the first 10 rows of the raw data
data.head(n=10)
| dt | LandAverageTemperature | LandAverageTemperatureUncertainty | LandMaxTemperature | LandMaxTemperatureUncertainty | LandMinTemperature | LandMinTemperatureUncertainty | LandAndOceanAverageTemperature | LandAndOceanAverageTemperatureUncertainty | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1750-01-01 | 3.034 | 3.574 | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 1750-02-01 | 3.083 | 3.702 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 1750-03-01 | 5.626 | 3.076 | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 1750-04-01 | 8.490 | 2.451 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1750-05-01 | 11.573 | 2.072 | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 1750-06-01 | 12.937 | 1.724 | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 1750-07-01 | 15.868 | 1.911 | NaN | NaN | NaN | NaN | NaN | NaN |
| 7 | 1750-08-01 | 14.750 | 2.231 | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | 1750-09-01 | 11.413 | 2.637 | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | 1750-10-01 | 6.367 | 2.668 | NaN | NaN | NaN | NaN | NaN | NaN |
## Make sure 'dt' is a datetime column, then use it as the index
data = data.assign(dt=pd.to_datetime(data['dt'])).set_index('dt')
## Drop every row that has a missing value in any column
data = data.dropna()
## Keep only the observations from 1850-01-01 through 2015-12-01
data = data.loc['1850-01-01':'2015-12-01']
## Save the filtered dataset to a CSV file
data.to_csv('new_dataset.csv')
## Preview the first 10 rows of the filtered data
data.head(n=10)
| LandAverageTemperature | LandAverageTemperatureUncertainty | LandMaxTemperature | LandMaxTemperatureUncertainty | LandMinTemperature | LandMinTemperatureUncertainty | LandAndOceanAverageTemperature | LandAndOceanAverageTemperatureUncertainty | |
|---|---|---|---|---|---|---|---|---|
| dt | ||||||||
| 1850-01-01 | 0.749 | 1.105 | 8.242 | 1.738 | -3.206 | 2.822 | 12.833 | 0.367 |
| 1850-02-01 | 3.071 | 1.275 | 9.970 | 3.007 | -2.291 | 1.623 | 13.588 | 0.414 |
| 1850-03-01 | 4.954 | 0.955 | 10.347 | 2.401 | -1.905 | 1.410 | 14.043 | 0.341 |
| 1850-04-01 | 7.217 | 0.665 | 12.934 | 1.004 | 1.018 | 1.329 | 14.667 | 0.267 |
| 1850-05-01 | 10.004 | 0.617 | 15.655 | 2.406 | 3.811 | 1.347 | 15.507 | 0.249 |
| 1850-06-01 | 13.150 | 0.614 | 18.946 | 2.817 | 7.106 | 0.857 | 16.353 | 0.245 |
| 1850-07-01 | 14.492 | 0.614 | 19.233 | 2.840 | 8.014 | 0.786 | 16.783 | 0.238 |
| 1850-08-01 | 14.039 | 0.802 | 18.477 | 2.079 | 7.406 | 1.086 | 16.718 | 0.280 |
| 1850-09-01 | 11.505 | 0.675 | 15.846 | 2.692 | 4.533 | 1.798 | 15.886 | 0.254 |
| 1850-10-01 | 8.091 | 0.863 | 13.189 | 2.338 | 2.013 | 2.133 | 14.831 | 0.297 |
## Preview the last 10 rows of the filtered data
data.tail(n=10)
| LandAverageTemperature | LandAverageTemperatureUncertainty | LandMaxTemperature | LandMaxTemperatureUncertainty | LandMinTemperature | LandMinTemperatureUncertainty | LandAndOceanAverageTemperature | LandAndOceanAverageTemperatureUncertainty | |
|---|---|---|---|---|---|---|---|---|
| dt | ||||||||
| 2015-03-01 | 6.740 | 0.060 | 12.659 | 0.096 | 0.894 | 0.079 | 15.193 | 0.061 |
| 2015-04-01 | 9.313 | 0.088 | 15.224 | 0.137 | 3.402 | 0.147 | 15.962 | 0.061 |
| 2015-05-01 | 12.312 | 0.081 | 18.181 | 0.117 | 6.313 | 0.153 | 16.774 | 0.058 |
| 2015-06-01 | 14.505 | 0.068 | 20.364 | 0.133 | 8.627 | 0.168 | 17.390 | 0.057 |
| 2015-07-01 | 15.051 | 0.086 | 20.904 | 0.109 | 9.326 | 0.225 | 17.611 | 0.058 |
| 2015-08-01 | 14.755 | 0.072 | 20.699 | 0.110 | 9.005 | 0.170 | 17.589 | 0.057 |
| 2015-09-01 | 12.999 | 0.079 | 18.845 | 0.088 | 7.199 | 0.229 | 17.049 | 0.058 |
| 2015-10-01 | 10.801 | 0.102 | 16.450 | 0.059 | 5.232 | 0.115 | 16.290 | 0.062 |
| 2015-11-01 | 7.433 | 0.119 | 12.892 | 0.093 | 2.157 | 0.106 | 15.252 | 0.063 |
| 2015-12-01 | 5.518 | 0.100 | 10.725 | 0.154 | 0.287 | 0.099 | 14.774 | 0.062 |
## Print a summary of the DataFrame (index, columns, non-null counts and dtypes)
data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 1992 entries, 1850-01-31 to 2015-12-31 Freq: M Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 LandAverageTemperature 1992 non-null float64 1 LandAverageTemperatureUncertainty 1992 non-null float64 2 LandMaxTemperature 1992 non-null float64 3 LandMaxTemperatureUncertainty 1992 non-null float64 4 LandMinTemperature 1992 non-null float64 5 LandMinTemperatureUncertainty 1992 non-null float64 6 LandAndOceanAverageTemperature 1992 non-null float64 7 LandAndOceanAverageTemperatureUncertainty 1992 non-null float64 8 10yr_ma 1873 non-null float64 dtypes: float64(9) memory usage: 155.6 KB
## Plot a descriptive histogram for each of the three temperature series
for column in ('LandAverageTemperature',
               'LandAndOceanAverageTemperature',
               'LandMaxTemperature'):
    plt.hist(data[column])
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
## Basic line plot of every column in the data
data.plot()
<AxesSubplot:xlabel='dt'>
## Import the library to decompose
from statsmodels.tsa.seasonal import seasonal_decompose

## For each temperature series: plot the raw series, plot its rolling mean,
## then plot a multiplicative seasonal decomposition with a period of 12.
## NOTE(review): the rolling window is 12 for LandAverageTemperature but 16 for
## the other two series -- confirm whether that difference is intentional.
rolling_windows = {
    'LandAverageTemperature': 12,
    'LandAndOceanAverageTemperature': 16,
    'LandMaxTemperature': 16,
}
for column, window in rolling_windows.items():
    series = data[column]
    ## Plot the time series
    series.plot()
    plt.show()
    ## Calculate the rolling mean and plot it
    rolling_mean = series.rolling(window=window).mean()
    rolling_mean.plot()
    plt.show()
    ## Decompose the series and plot its components
    result = seasonal_decompose(series, model='multiplicative', period=12)
    result.plot()
    plt.show()
## Below are 2 types of time series data visualization
## Plot Land Average Temperature vs time
plt.figure(figsize=(12,6))
plt.plot(data['LandAverageTemperature'])
plt.title('Land Average Temperature vs Time')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.show()
## Plot the same series on a wider figure.
## Only the LandAverageTemperature column is drawn: plotting the whole DataFrame
## would put every column on axes that are labelled for this single variable.
plt.figure(figsize=(15,6))
plt.plot(data['LandAverageTemperature'])
plt.xlabel('Year')
plt.ylabel('LandAverageTemperature')
plt.title('Land Average Temperature vs Time')
plt.show()
## Perform a Dickey-Fuller test for stationarity on LandAverageTemperature
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(data['LandAverageTemperature'])
# adf_result[0] is the test statistic, [1] the p-value, [4] the critical values
print(f'ADF Statistic: {adf_result[0]:.4f}')
print(f'p-value: {adf_result[1]:.4f}')
print('Critical Values:')
for key, value in adf_result[4].items():
    # The loop body must be indented, otherwise Python raises IndentationError
    print(f'\t{key}: {value:.4f}')
ADF Statistic: -1.4553 p-value: 0.5555 Critical Values: 1%: -3.4337 5%: -2.8630 10%: -2.5676
## Plot Land and Ocean Average Temperature vs time
plt.figure(figsize=(12,6))
plt.plot(data['LandAndOceanAverageTemperature'])
plt.title('Land and Ocean Average Temperature vs Time')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.show()
## Plot the same series on a wider figure.
## Only the LandAndOceanAverageTemperature column is drawn: plotting the whole
## DataFrame would put every column on axes labelled for this single variable.
plt.figure(figsize=(15,6))
plt.plot(data['LandAndOceanAverageTemperature'])
plt.xlabel('Year')
plt.ylabel('LandAndOceanAverageTemperature')
plt.title('LandAndOceanAverageTemperature vs Time')
plt.show()
## Perform a Dickey-Fuller test for stationarity on LandAndOceanAverageTemperature
adf_result = adfuller(data['LandAndOceanAverageTemperature'])
# adf_result[0] is the test statistic, [1] the p-value, [4] the critical values
print(f'ADF Statistic: {adf_result[0]:.4f}')
print(f'p-value: {adf_result[1]:.4f}')
print('Critical Values:')
for key, value in adf_result[4].items():
    # The loop body must be indented, otherwise Python raises IndentationError
    print(f'\t{key}: {value:.4f}')
ADF Statistic: -1.1353 p-value: 0.7008 Critical Values: 1%: -3.4337 5%: -2.8630 10%: -2.5676
## Plot Land Maximum Temperature vs time
plt.figure(figsize=(12,6))
plt.plot(data['LandMaxTemperature'])
plt.title('Land Maximum Temperature vs Time')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.show()
## Plot the same series on a wider figure.
## Only the LandMaxTemperature column is drawn: plotting the whole DataFrame
## would put every column on axes that are labelled for this single variable.
plt.figure(figsize=(15,6))
plt.plot(data['LandMaxTemperature'])
plt.xlabel('Year')
plt.ylabel('LandMaxTemperature')
plt.title('LandMaxTemperature vs Time')
plt.show()
## Perform a Dickey-Fuller test for stationarity on LandMaxTemperature
adf_result = adfuller(data['LandMaxTemperature'])
# adf_result[0] is the test statistic, [1] the p-value, [4] the critical values
print(f'ADF Statistic: {adf_result[0]:.4f}')
print(f'p-value: {adf_result[1]:.4f}')
print('Critical Values:')
for key, value in adf_result[4].items():
    # The loop body must be indented, otherwise Python raises IndentationError
    print(f'\t{key}: {value:.4f}')
ADF Statistic: -2.6571 p-value: 0.0818 Critical Values: 1%: -3.4337 5%: -2.8630 10%: -2.5676
# Perform autocorrelation and partial autocorrelation analysis for LandAverageTemperature
fig, ax = plt.subplots(2,1,figsize=(12,8))
sm.graphics.tsa.plot_acf(data['LandAverageTemperature'], lags=50, ax=ax[0])
# method='ywm' is passed explicitly: the default 'yw' method triggers a
# FutureWarning and can produce PACF values outside the [-1, 1] interval.
sm.graphics.tsa.plot_pacf(data['LandAverageTemperature'], lags=50, ax=ax[1], method='ywm')
plt.show()
C:\Users\Deepali Paul\anaconda3\lib\site-packages\statsmodels\graphics\tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
warnings.warn(
# Perform autocorrelation and partial autocorrelation analysis for LandAndOceanAverageTemperature
fig, ax = plt.subplots(2,1,figsize=(12,8))
sm.graphics.tsa.plot_acf(data['LandAndOceanAverageTemperature'], lags=50, ax=ax[0])
# method='ywm' is passed explicitly: the default 'yw' method triggers a
# FutureWarning and can produce PACF values outside the [-1, 1] interval.
sm.graphics.tsa.plot_pacf(data['LandAndOceanAverageTemperature'], lags=50, ax=ax[1], method='ywm')
plt.show()
# Perform autocorrelation and partial autocorrelation analysis for LandMaxTemperature
fig, ax = plt.subplots(2,1,figsize=(12,8))
sm.graphics.tsa.plot_acf(data['LandMaxTemperature'], lags=50, ax=ax[0])
# method='ywm' is passed explicitly: the default 'yw' method triggers a
# FutureWarning and can produce PACF values outside the [-1, 1] interval.
sm.graphics.tsa.plot_pacf(data['LandMaxTemperature'], lags=50, ax=ax[1], method='ywm')
plt.show()
# Fit an ARIMA(1,1,1) model to LandAverageTemperature and print the fit summary
model = sm.tsa.ARIMA(endog=data['LandAverageTemperature'], order=(1, 1, 1))
results = model.fit()
print(results.summary())
# Forecast the 10 periods that follow the end of the sample
forecast = results.forecast(steps=10)
print(forecast)
SARIMAX Results
==================================================================================
Dep. Variable: LandAverageTemperature No. Observations: 1992
Model: ARIMA(1, 1, 1) Log Likelihood -3098.904
Date: Sun, 19 Mar 2023 AIC 6203.808
Time: 19:49:21 BIC 6220.597
Sample: 01-31-1850 HQIC 6209.974
- 12-31-2015
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.7470 0.020 38.032 0.000 0.708 0.785
ma.L1 0.3587 0.026 13.806 0.000 0.308 0.410
sigma2 1.3156 0.047 27.946 0.000 1.223 1.408
===================================================================================
Ljung-Box (L1) (Q): 43.73 Jarque-Bera (JB): 116.05
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.78 Skew: 0.59
Prob(H) (two-sided): 0.00 Kurtosis: 2.88
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
2016-01-31 4.507555
2016-02-29 3.752767
2016-03-31 3.188949
2016-04-30 2.767785
2016-05-31 2.453180
2016-06-30 2.218175
2016-07-31 2.042629
2016-08-31 1.911498
2016-09-30 1.813546
2016-10-31 1.740376
Freq: M, Name: predicted_mean, dtype: float64
# Fit an ARIMA(1,1,1) model to LandAndOceanAverageTemperature and print the fit summary
model = sm.tsa.ARIMA(endog=data['LandAndOceanAverageTemperature'], order=(1, 1, 1))
results = model.fit()
print(results.summary())
# Forecast the 10 periods that follow the end of the sample
forecast = results.forecast(steps=10)
print(forecast)
SARIMAX Results
==========================================================================================
Dep. Variable: LandAndOceanAverageTemperature No. Observations: 1992
Model: ARIMA(1, 1, 1) Log Likelihood -723.937
Date: Sun, 19 Mar 2023 AIC 1453.873
Time: 19:49:08 BIC 1470.662
Sample: 01-31-1850 HQIC 1460.039
- 12-31-2015
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.7380 0.020 37.451 0.000 0.699 0.777
ma.L1 0.3351 0.026 12.763 0.000 0.284 0.387
sigma2 0.1211 0.004 27.838 0.000 0.113 0.130
===================================================================================
Ljung-Box (L1) (Q): 36.97 Jarque-Bera (JB): 44.17
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.77 Skew: 0.35
Prob(H) (two-sided): 0.00 Kurtosis: 2.78
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
2016-01-31 14.563972
2016-02-29 14.408977
2016-03-31 14.294595
2016-04-30 14.210183
2016-05-31 14.147890
2016-06-30 14.101919
2016-07-31 14.067993
2016-08-31 14.042957
2016-09-30 14.024481
2016-10-31 14.010846
Freq: M, Name: predicted_mean, dtype: float64
# Fit an ARIMA(1,1,1) model to LandMaxTemperature and print the fit summary
model = sm.tsa.ARIMA(endog=data['LandMaxTemperature'], order=(1, 1, 1))
results = model.fit()
print(results.summary())
# Forecast the 10 periods that follow the end of the sample
forecast = results.forecast(steps=10)
print(forecast)
SARIMAX Results
==============================================================================
Dep. Variable: LandMaxTemperature No. Observations: 1992
Model: ARIMA(1, 1, 1) Log Likelihood -3311.370
Date: Sun, 19 Mar 2023 AIC 6628.740
Time: 19:49:59 BIC 6645.529
Sample: 01-31-1850 HQIC 6634.906
- 12-31-2015
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.7236 0.020 35.685 0.000 0.684 0.763
ma.L1 0.2928 0.026 11.111 0.000 0.241 0.344
sigma2 1.6288 0.055 29.651 0.000 1.521 1.736
===================================================================================
Ljung-Box (L1) (Q): 22.95 Jarque-Bera (JB): 65.24
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.61 Skew: 0.42
Prob(H) (two-sided): 0.00 Kurtosis: 3.26
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
2016-01-31 9.418161
2016-02-29 8.472574
2016-03-31 7.788377
2016-04-30 7.293314
2016-05-31 6.935102
2016-06-30 6.675911
2016-07-31 6.488369
2016-08-31 6.352669
2016-09-30 6.254481
2016-10-31 6.183435
Freq: M, Name: predicted_mean, dtype: float64
## We can observe the following:
## The time series plots show that all three temperature variables exhibit an overall increasing trend over time, with some seasonal fluctuations.
## The decomposition plots for the 'LandAverageTemperature' variable show that the trend component accounts for most of the
# variation in the data, followed by the seasonal component. The residual component appears to be relatively small and random.
## The autocorrelation and partial autocorrelation plots for the 'LandAverageTemperature' variable show some significant
# lags in the data, which may indicate the presence of some correlation or seasonality in the data.
## The ARIMA model summary for the 'LandAverageTemperature' variable shows that the model has a significant AR(1) coefficient
# indicating that the current value of the variable is somewhat dependent on its past value.
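## The forecast values for the 'LandAverageTemperature' variable decline steadily over the next 10 time periods
# (from about 4.51 to about 1.74), which does not match the overall increasing trend observed in the original time series plot;
# this is likely because a non-seasonal ARIMA(1,1,1) model cannot reproduce the annual temperature cycle.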
## The plots of the 'LandAverageTemperature', 'LandAndOceanAverageTemperature', and 'LandMaxTemperature' columns from the
#'globaltemperatures' dataset show a general increasing trend in global temperatures from 1850 to 2015.
## There is some variability in temperature from year to year, but the overall trend is upwards.
## The 'LandMaxTemperature' plot also shows a higher degree of variability compared to the other two metrics,
# which could be an indication of more extreme weather events.
## These observations suggest that global warming is a real and ongoing phenomenon, with significant implications
# for the environment and human society.
## Calculate the 10-year moving average for LandAndOceanAverageTemperature.
## The data has one observation per month, so 10 years is 10 * 12 = 120 rows
## (a window of 10 would only average over 10 months).
rolling_avg = data['LandAndOceanAverageTemperature'].rolling(window=10 * 12).mean()
## Plot the original data and the rolling average on the same graph
plt.figure(figsize=(10,5))
sns.lineplot(x=data.index, y=data['LandAndOceanAverageTemperature'], label='Original Data')
sns.lineplot(x=data.index, y=rolling_avg, label='10-Year Moving Average')
plt.title('Land and Ocean Average Temperature with 10-Year Moving Average')
plt.xlabel('Year')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.show()
## Interpretation of results:
## The plot of the 'LandAndOceanAverageTemperature' column with the 10-year moving average
## shows a clearer picture of the long-term trend in global temperatures, smoothing out the year-to-year fluctuations.
## The plot demonstrates that the trend of increasing temperatures over time has been relatively steady
# since the late 19th century, with a more pronounced upward trend in recent decades.
## From the plot, we can see that there has been a steady increase in global temperature over the past century.
## There are some fluctuations in the data, but the overall trend is clear.
## The 10-year moving average line shows a relatively smooth upward trend with a few minor dips,
# suggesting that the global temperature has been increasing gradually over the past several decades.
## The moving average plot also shows that there were periods of cooling or slower warming, such as in the mid-20th century,
## but the overall trend remains upward. This plot confirms the previous analysis of the trend in global temperatures,
## and it helps to better identify long-term trends by removing short-term fluctuations.